
# coding: utf-8

# In[1]:

import pandas as pd
import time
import gc
from datetime import datetime
from collections import Counter


# In[2]:

# ---------------------
# IMPORT FILES
# Both CSVs are parsed with identical options: comma-separated, decoded as
# ISO-8859-1, every column kept as str. Missing cells are replaced by ''.
read_opts = dict(sep=",", encoding="ISO-8859-1", dtype=str)
INPT = pd.read_csv('assg_namstand.csv', **read_opts).fillna('')
TRGT = pd.read_csv('dg_namstand.csv', **read_opts).fillna('')

# Column names of each frame, then the row counts.
print(list(INPT))
print(list(TRGT))
print('INTP --',len(INPT),' ','TRGT --',len(TRGT))


# In[3]:

def changeencode(data, cols):
    """Re-decode selected entries of `data` in place and return `data`.

    For every key in `cols`, the string stored at ``data[key]`` is encoded
    as ISO-8859-1 and the resulting bytes are decoded as UTF-8; the result is
    written back to the same slot.

    Parameters:
        data: indexable, item-assignable container of str.
        cols: iterable of integer keys into `data` (``key + 1`` is computed
            for the progress check).

    Returns:
        The same `data` object, modified in place.

    Any UnicodeEncodeError / UnicodeDecodeError raised by the conversion is
    not caught here.
    """
    for idx in cols:
        latin1_bytes = data[idx].encode('iso-8859-1')
        data[idx] = latin1_bytes.decode('utf-8')
        # progress marker once every 50000 processed entries
        if not (idx + 1) % 50000:
            print(idx)
    return data


# In[4]:

# Re-decode the three text columns of INPT, one row position at a time.
cols = range(len(INPT))
for field in ('Name', 'loc1', 'loc2'):
    INPT[field] = changeencode(INPT[field], cols)


# In[5]:

# Re-decode the three text columns of TRGT and report how long it took.
cols = range(len(TRGT))
start_time = time.time()
for field in ('Name', 'loc1', 'loc2'):
    TRGT[field] = changeencode(TRGT[field], cols)
elapsed_min = (time.time() - start_time)/60
print('Encoding TRGT done in --- %s min ---' % elapsed_min)


# In[6]:

# Eyeball the first 20 rows of INPT after re-decoding.
print(INPT.head(20))


# In[7]:

# Eyeball the first 20 rows of TRGT after re-decoding.
print(TRGT.head(20))


# In[8]:

# Save encoding
# Keep handles on the re-decoded frames. These are plain references (no copy
# is made); they stay valid because INPT / TRGT are rebound, not mutated,
# by the column-selection step that follows.
temp1, temp2 = INPT, TRGT


# In[ ]:




# In[9]:

# ---------------------
# Set variable names
# Input: KIPRIS / Target: Dataguide
#
# Each frame is reduced to the six columns used for matching, de-duplicated,
# re-indexed from 0, and its id column is renamed to the common name 'key'
# ('stem_name' becomes 'stem'). 'nb' is the resulting row number and is the
# identifier used by all lookup tables built afterwards.
# NOTE(review): missing values were already replaced by '' when the files
# were loaded, so fillna('NoName') is expected to find nothing to fill.

INPT = (
    INPT[['Name','standard_name','stem_name','kiprisid','loc1','loc2']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'kiprisid':'key','stem_name':'stem'})
    .fillna('NoName')
)
INPT['nb'] = INPT.index

TRGT = (
    TRGT[['Name','standard_name','stem_name','Symbol','loc1','loc2']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'Symbol':'key','stem_name':'stem'})
    .fillna('NoName')
)
TRGT['nb'] = TRGT.index

print(list(INPT))
print(list(TRGT))
print('INTP --',len(INPT),' ','TRGT --',len(TRGT))

# after matching, the <nb : nb> pairs are translated back to the 'key' columns


# In[10]:

# ---------------------
# DEFAULT MATCH PARAMETERS
# A candidate's score is the sum of the weights of the words it shares with
# the name being matched, where weight = 100 / word frequency.
# The relative score is 100 * score / (full score of the candidate's own stem).
threshold = 110 # accept a candidate whose score is above this
lowhold = 45    # also accept when score is above this AND relative score >= 100
maxno = 5       # intended cap on reported candidates when there is no complete
                # match; not referenced by the matching code in this notebook
# Original threshold: 110/45
# ---------------------
start_time = time.time()


def _lists_by(frame, by, value):
    """Return {distinct `by` value: list of `value` entries of the rows in that group}."""
    pair = frame[[value, by]]
    return {label: rows[value].tolist() for label, rows in pair.groupby(by)}


# main global variables
# All src* tables are built from TRGT and all trg* tables from INPT.
# (Building src* from INPT and trg* from TRGT instead reverses the direction.)
srcnames = _lists_by(TRGT, 'nb', 'stem')                # nb -> [stem]
srcko = _lists_by(TRGT, 'Name', 'nb')                   # Name -> [nb, ...]
trgko = _lists_by(INPT, 'standard_name', 'Name')        # standard_name -> [Name, ...]
srcfull = _lists_by(TRGT, 'standard_name', 'nb')        # standard_name -> [nb, ...]

# nb -> key translation tables, kept as frames for the final merges
srckey = TRGT[['key','nb']]
trgkey = INPT[['key','nb']]

srcloc1 = _lists_by(TRGT, 'nb', 'loc1')                 # nb -> [loc1]
srcloc2 = _lists_by(TRGT, 'nb', 'loc2')                 # nb -> [loc2]

# frame that later receives the per-stem score column
srcscores = TRGT[['stem','nb']]
print('srcscores --',len(srcscores))

print('Time --- %s min ---' % ((time.time() - start_time)/60))


# In[11]:

# Spot-check three lookup tables with hard-coded sample keys
# (each lookup raises KeyError if the sample key is absent from the data).
sample_names = trgko['PUNG YANG WOONSU CORP']
print(sample_names[0])
sample_ids = srcko['가담산업']
print(sample_ids)
first_loc = srcloc1[0]
print(first_loc)


# In[12]:

#-----------------------------------------------
# WORD TOKEN DICTIONARY
# Collect every whitespace-separated token of every TRGT stem into one flat
# list (duplicates kept); the frequency table is derived from it afterwards.

# process one line at a time
start_time = time.time()
wcounts = [] # container for word tokens appearing in the names

#INPT_stem = INPT['stem'].tolist()
TRGT_stem = TRGT['stem'].tolist()

#for stem in INPT_stem:
for stem in TRGT_stem:
    if stem == 'NoName':
        continue
    # split() with no argument splits on whitespace - pass a delimiter if
    # needed (e.g. stem.split(",")).
    # extend() appends in place. The previous `wcounts = wcounts + words`
    # rebuilt the whole list for every row, which made this loop quadratic.
    wcounts.extend(stem.split())

print('Word Token Dictionary Done --- %s min ---' % ((time.time() - start_time)/60))


# In[13]:

#-----------------------------------------------
# WORD TOKEN FREQUENCY DICTIONARY
# counts maps each token to its weight, 100 / (number of occurrences in
# wcounts). It stays a Counter, so looking up an unseen token yields 0.

start_time = time.time()
counts = Counter({
    token: 100/occurrences
    for token, occurrences in Counter(wcounts).items()
})

print('Word Token Frequency Dictionary Done --- %s min ---' % ((time.time() - start_time)/60)) # Less than 1 min


# In[14]:

# Spot-check the weight of one sample token.
pung_weight = counts['PUNG']
print(pung_weight)


# In[15]:

#-----------------------------------------------        
# SRCSCORES -- FOR THOSE UNMATCHED IN STEP 1

def calscore(data):
    """Return the total weight of the whitespace-separated words in `data`.

    Each word is looked up in the module-level `counts` table and the
    weights are summed; a string with no words gives 0.

    Parameters:
        data: the stem to score (expected to be a str).

    Returns:
        The summed weight, or the string 'error' when scoring fails
        (for example when `data` is not a string and has no split()).
    """
    try:
        return sum(counts[word] for word in data.split())
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit. The 'error' sentinel is kept for existing callers.
        return 'error'

start_time = time.time()
# srcscores was obtained by selecting columns from TRGT, so assigning a new
# column onto it directly triggers pandas' SettingWithCopyWarning. assign()
# returns a new frame with the extra 'srcscore' column instead.
srcscores = srcscores.assign(srcscore=srcscores['stem'].apply(calscore))
srcscores = srcscores[~(srcscores.stem == 'NoName')]

# nb -> [full score of that row's stem]
srcscores_dict = {k: g['srcscore'].tolist() for k,g in srcscores[['nb','srcscore']].groupby("nb")}

print('SRCSCORES Done --- %s min ---' % ((time.time() - start_time)/60)) # Less than 1 min
print(len(srcscores))


# In[16]:

# Spot-check: ten rows of the scored frame (by position) and the stored
# score for nb 100.
print(srcscores.iloc[100:110])
scores_for_100 = srcscores_dict[100]
print(scores_for_100[0])


# In[17]:

#-----------------------------------------------
# WORDID DICTIONARY -- FOR THOSE UNMATCHED IN STEP 1
# Inverted index: word -> list of nb values of the srcscores rows whose stem
# contains that word. A word occurring twice in one stem adds that nb twice.

wordid = {}
start_time = time.time()
for stem, nb in zip(srcscores['stem'], srcscores['nb']):
    nb = int(nb)
    if stem == 'NoName':
        continue

    # split() with no argument splits on whitespace - pass a delimiter if
    # needed (e.g. stem.split(","))
    for word in stem.split():
        wordid.setdefault(word, []).append(nb)

print('WORDID Done --- %s min ---' % ((time.time() - start_time)/60)) # Less than 1 min


# In[18]:

# Spot-check the id list of one sample token (KeyError if it never occurs).
pung_ids = wordid['PUNG']
print(pung_ids)


# In[ ]:




# In[ ]:




# In[19]:

# ---------------------------------------------------------------
# MATCHING WITH TRGT
#
# Every INPT row is compared against the lookup tables built from TRGT:
#   * if its Name exists in srcko, only exact-name candidates are considered
#     and one 'perfect' record is kept per candidate with the same loc1;
#   * otherwise every TRGT row sharing at least one word with the stem is
#     scored, and candidates with the same loc1 that clear the thresholds
#     are kept in 'over_threshold'.

# one target at a time
start_time = time.time()
perfect = []
over_threshold = []

#for j,line in TRGT.iterrows():
for j, line in INPT.iterrows():
    trg_nb = line.nb
    stdname = line.standard_name
    koname = line.Name
    stem = line.stem
    loc1 = line.loc1
    loc2 = line.loc2
    target = stem.split() # word tokens in target stem

    # accumulate, per candidate nb, the weights of the words it shares
    score = {}
    for tword in target:
        if tword not in wordid:
            continue
        wt = counts[tword]
        for nb in wordid[tword]:
            score[nb] = score.get(nb, 0) + wt

    if koname in srcko:
        # exact name match: keep the candidates located in the same loc1
        # (loc2 is not compared)
        for src_nb in srcko[koname]:
            if srcloc1[src_nb][0] == loc1:
                perfect.append({'src_id':src_nb,'src_name':koname,'trg_id':trg_nb,'trg_name':koname,'loc1':loc1})
    else:
        # no exact name: fall back to the word-overlap scores
        for key, value in score.items():
            sc = srcscores_dict[key][0]
            relscore = 100*value/sc
            strong = value > threshold
            # original threshold for relscore = 100
            relative = (relscore >= 100) and (value > lowhold)
            if (strong or relative) and srcloc1[key][0] == loc1:
                over_threshold.append({'src_id':key,'src_stem':srcnames[key][0],'trg_id':trg_nb,'trg_stem':stem,'loc1':loc1,'sc':sc,'v':value,'rsc':relscore})

    if j % 10000 == 0:
        print(j,'--- %s min ---' % ((time.time() - start_time)/60))

print('MATCHING Done --- %s min ---' % ((time.time() - start_time)/60)) # Less than 1 min

# Turn the collected match records into frames. Columns are listed
# explicitly (in the same order as the record keys) so that an empty match
# list still yields a frame with 'src_id' / 'trg_id'; without them the key
# merges that follow raise KeyError.
perfect = pd.DataFrame(perfect, columns=['src_id','src_name','trg_id','trg_name','loc1'])
over_threshold = pd.DataFrame(over_threshold, columns=['src_id','src_stem','trg_id','trg_stem','loc1','sc','v','rsc'])


# In[20]:

### SAVE (KIPRIS to DataGuide) ###


# In[21]:

# Translate row numbers back to the original identifiers (exact-name matches).
# srckey was built from TRGT, whose 'key' column is the former 'Symbol';
# trgkey was built from INPT, whose 'key' column is the former 'kiprisid'.
# The labels below follow that origin (they were previously swapped); the
# positional order of the output columns is unchanged.
perfect = pd.merge(perfect, srckey, left_on='src_id',right_on='nb',how='inner')
perfect = perfect[['key','src_name','trg_id','trg_name','loc1']]
perfect = perfect.rename(columns = {'key':'Symbol'})
perfect = pd.merge(perfect, trgkey, left_on='trg_id',right_on='nb',how='inner')
perfect = perfect[['Symbol','src_name','key','trg_name','loc1']]; perfect = perfect.rename(columns = {'key':'kiprisid'})
print(perfect[:10]); print(len(perfect))


# In[22]:

# Translate row numbers back to the original identifiers (score-based matches).
# Same labelling as for the exact matches: the src side carries TRGT's
# 'Symbol', the trg side carries INPT's 'kiprisid'.
over_threshold = pd.merge(over_threshold, srckey, left_on='src_id',right_on='nb',how='inner')
over_threshold = over_threshold[['key','src_stem','trg_id','trg_stem','loc1','sc','v','rsc']]
over_threshold = over_threshold.rename(columns = {'key':'Symbol'})
over_threshold = pd.merge(over_threshold, trgkey, left_on='trg_id',right_on='nb',how='inner')
over_threshold = over_threshold[['Symbol','src_stem','key','trg_stem','loc1','sc','v','rsc']]; over_threshold = over_threshold.rename(columns = {'key':'kiprisid'})
print(over_threshold[:10]); print(len(over_threshold))


# In[23]:

# Write both match tables to disk as '*'-separated, UTF-8 encoded text
# (the frame index is written as the first column).
for frame, path in (
    (perfect, 'matched_perfect_dg2k.csv'),
    (over_threshold, 'matched_scorebased_dg2k.csv'),
):
    frame.to_csv(path, sep='*', encoding='utf-8')


# In[ ]:




# In[ ]:



